Q1
First I load the data files
Next I reshape dat into the handful data format. I use tidyr package for the aim instead of reshape package because the foramer is the improved version of the latter.
install.packages('tidyverse')
URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/tidyverse_1.1.1.tgz' を試しています
Content type 'application/x-gzip' length 37228 bytes (36 KB)
==================================================
downloaded 36 KB
The downloaded binary packages are in
/var/folders/1w/nyst5xl16t329h4gpz0j49hm0000gn/T//Rtmp2exVAC/downloaded_packages
library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ----------------------------------------------------------------------------------------------------------------------------------
filter(): dplyr, stats
lag(): dplyr, stats
dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District\ of\ Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New\ Hampshire,New\ Jersey,New\ Mexico,New\ York,North\ Carolina,North\ Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode\ Island,South\ Carolina,South\ Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West\ Virginia,Wisconsin,Wyoming)
エラー: 想定外の入力です in:
"dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District\"
Delete the puctuations.
Then I add two new columns to the above data frame.
d1 <- merge(dat, states, by.x="From", by.y="States")
d1 <- d1[,c(-5,-6)]
names(d1)[4] <- "ID_From"
d2 <- merge(d1, states, by.x="To", by.y="States")
d2 <- d2[,c(-6,-7)]
names(d2)[5] <- "ID_To"
dat <- d2
dat
Sort the data frame in the ascending order
dat <- arrange(dat, ID_From)
dat <- arrange(dat, ID_To)
dat
Convert the above data frame into the matrix form
Drawing the chord diagram
chorddiag(as.matrix(t5),groupColors=states$Color,showTicks=F,groupnamePadding = 20,groupThickness=.05,groupnameFontsize=10)
row names of the 'data' matrix differ from its column names or the 'groupNames' argument.
Q2
まだindividualを入れてないです。 Read “Stops On Lines” and the all GIS data of bus lines.
library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)
# Bus Stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, CRS("+proj=longlat +datum=WGS84"))
# Bus Routes
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
CC.pj <- spTransform(CC, CRS("+proj=longlat +datum=WGS84"))
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LE.pj <- spTransform(LE, CRS("+proj=longlat +datum=WGS84"))
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LCBD.pj <- spTransform(LCBD, CRS("+proj=longlat +datum=WGS84"))
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, CRS("+proj=longlat +datum=WGS84"))
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")
RBRT.pj <- spTransform(RBRT, CRS("+proj=longlat +datum=WGS84"))
# とりあえずindividual2についてのみreadできることを確認。一旦放置してこれ以外でできるか試す。
I2 <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216","2")
I2.pj <- spTransform(I2, CRS("+proj=longlat +datum=WGS84"))
make Line_list
tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)
tmps <- list(tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT)
Line_list <- list()
for (i in 1:5){
for (j in 1:length(tmps[[i]])){
Line_list <- c(Line_list, tmps[[i]][j]@lines[[1]]@Lines)
}
}
make new_id
pjs <- list(CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)
LinLSs <- list()
for (i in 1:5){
LinLSs <- c(LinLSs, sapply(pjs[[i]]@lines, function(x) length(x@Lines)))
}
LinLSs <- LinLSs %>% unlist()
new_id <- sapply(1:length(LinLSs), function(x) paste0(x, "_", seq.int(LinLSs[[x]]))) %>%
unlist()
SLDF <- mapply(function(x, y) Lines(x, ID = y), x = Line_list, y = new_id) %>%
list() %>%
SpatialLines() %>%
SpatialLinesDataFrame(data = DAT)
SpatialLines(.) でエラー:
lines list not exclusively filled with Lines objects
make new lines and LA map
Q3
First I load the data.
library(quantmod)
library(highcharter)
x <- getSymbols("AUD/JPY", src = "oanda", auto.assign = FALSE)
y <- getSymbols("GBP/USD", src = "oanda", auto.assign = FALSE)
Next make Bollinger’s bands for each exchange rate.
x.BBands.ll <- BBands(x)$dn
x.BBands.ul <- BBands(x)$up
x.BBands.m <- BBands(x)$mavg
y.BBands.ll <- BBands(y)$dn
y.BBands.ul <- BBands(y)$up
y.BBands.m <- BBands(y)$mavg
The drawing code is as follows.
hc <- highchart(type="stock") %>%
hc_title(text="Charting Exchange Rates") %>%
hc_subtitle(text = "Data extracted using quantmod package") %>%
hc_yAxis_multiples(
list(top = "0%", height = "50%", offset=0, opposite=TRUE),
list(top = "50%", height = "50%", offset=0, opposite=TRUE)
)%>%
hc_add_series(x, id = "audjpy",name ="audjpy", yAxis=0, color="blue", lineWidth=1.5) %>%
hc_add_series(x.BBands.ll, id = "audjpy.ll", name="audjpy Lower BBands",yAxis=0,
color="black",dashStyle='shortdash', lineWidth=1) %>%
hc_add_series(x.BBands.ul, id = "audjpy.ul", name="audjpy Upper BBands",yAxis=0,
color="black",lineWidth=1) %>%
hc_add_series(x.BBands.m, id = "audjpy.m",name="audjpy BBands MA", yAxis=0,
color="red",lineWidth=1) %>%
hc_add_series(y, id = "gbpusd",name="gbpusd",yAxis=1, color="green", lineWidth=1.5) %>%
hc_add_series(y.BBands.ll, id = "gbpusd.ll",name="gbpusd Lower BBands", yAxis=1,
color="black",dashStyle='shortdash',lineWidth=1) %>%
hc_add_series(y.BBands.ul, id = "gbpusd.ul",name="gbpusd Upper BBands", yAxis=1,
color="black",lineWidth=1) %>%
hc_add_series(y.BBands.m, id = "gbpusd.m",name="gbpusd BBands MA", yAxis=1,
color="red",lineWidth=1) %>%
hc_add_theme(hc_theme_538())
hc
Q4
Load libraries and check the raw data. And make ffdf after converting character columns to factor columns in original df.
library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)
tmp <- flights
tmp$carrier <- as.factor(tmp$carrier)
tmp$tailnum <- as.factor(tmp$tailnum)
tmp$origin <- as.factor(tmp$origin)
tmp$dest <- as.factor(tmp$dest)
flightff <- as.ffdf(tmp)
Next I make new columns as follows
flightff$Delay <- ffifelse(flightff$dep_delay > 0, 1,0)
flightff$DepHour <- flightff$hour
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
flightff$Weekend <- ffifelse(day.of.week(month=flightff$month, day=flightff$day, year=flightff$year) == 6, 1, 0)
I exclude the rows whose Delay values are NA and rename it to logitff. And then I split the dataset into train set and test set.
logitff <- flightff[!is.na(flightff$Delay),]
indx <- ff(1:nrow(logitff))
p <- 0.7
trainIndx <- ff(indx[1:trunc(length(indx)*p)])
trainset <- logitff[trainIndx,]
testIndx <- ff(indx[(trunc(length(indx)*p)+1):length(indx)])
testset <- logitff[testIndx,]
Logistic regression
summary(fit)
Large data regression model: bigglm(Delay ~ DepHour + Car + Night + Weekend, data = trainset,
family = binomial(), sandwich = TRUE)
Sample size = 229964
Coef (95% CI) SE p
(Intercept) -1.8632 -1.8959 -1.8305 0.0164 0
DepHour 0.1108 0.1084 0.1133 0.0012 0
Car -0.1073 -0.1255 -0.0891 0.0091 0
Night -0.2672 -0.2968 -0.2376 0.0148 0
Weekend -0.1737 -0.2025 -0.1450 0.0144 0
Sandwich (model-robust) standard errors
predict and make confusionmatrix in train_set
train_confusion
0 1 Sum
0 122907 19209 142116
1 68525 19323 87848
Sum 191432 38532 229964
predict and make confusionmatrix in test_set
test_confusion
0 1 Sum
0 50824 7149 57973
1 31212 9372 40584
Sum 82036 16521 98557
Draw ROC curve

Q5
---
title: "Big Data Analytics Assignment 1"
output: html_notebook
---

## Q1

First I load the data files
```{r}
dat <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/migration2012.csv")
head(dat)
```

```{r}
states <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/states_chord.csv")
states
```

Next I reshape dat into the handful data format. I use **tidyr package** for the aim instead of **reshape package** because the foramer is the improved version of the latter.

```{r}
install.packages('tidyverse')
library(tidyverse)
```

```{r}
dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District.of.Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New.Hampshire,New.Jersey,New.Mexico,New.York,North.Carolina,North.Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode.Island,South.Carolina,South.Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West.Virginia,Wisconsin,Wyoming)
```

Delete the puctuations.

```{r}
dat <- data.frame(apply(dat, 2, function(y) gsub("[[:punct:]]", " ", y)))
dat
```

Then I add two new columns to the above data frame.

```{r}
d1 <- merge(dat, states, by.x="From", by.y="States")
d1 <- d1[,c(-5,-6)]
names(d1)[4] <- "ID_From"

d2 <- merge(d1, states, by.x="To", by.y="States")
d2 <- d2[,c(-6,-7)]
names(d2)[5] <- "ID_To"
dat <- d2
dat
```

Sort the data frame in the ascending order

```{r}
dat <- arrange(dat, ID_From)
dat <- arrange(dat, ID_To)
dat
```

Convert the above data frame into the matrix form

```{r}
t1 <- dat[,c(1,2,4,5)]
t1 <- t1 %>%
  spread(key=To, value=ID_To)

t2 <- dat[,c(1,2,3,4)]
t2 <- t2 %>%
  spread(key=To, value=value)

t3 <- rbind(t1[1,],t2)
t3$From <- as.character(t3$From)
t3[1,2] <- 0
t3[1,1] <- "ID_To"
t3 <- t3 %>%
  arrange(ID_From)

library(data.table)
setcolorder(t3,c("From","ID_From","Connecticut","Maine","Massachusetts","New Hampshire","Rhode Island",
"Vermont","New Jersey","New York","Pennsylvania","Illinois","Indiana",
"Michigan","Ohio","Wisconsin","Iowa","Kansas","Minnesota",
"Missouri","Nebraska","North Dakota","South Dakota","Delaware","Florida",
"Georgia","Maryland","North Carolina","South Carolina","Virginia","District of Columbia",
"West Virginia","Alabama","Kentucky","Mississippi","Tennessee","Arkansas",
"Louisiana","Oklahoma","Texas","Arizona","Colorado","Idaho",
"Montana","Nevada","New Mexico","Utah","Wyoming","Alaska",
"California","Hawaii","Oregon","Washington"))

t3 <- t3[c(-1),c(-2)]

t4 <- t3[,-1]
rownames(t4) <- t3[,1]

t5 <- data.frame(apply(t4, 2, function(y) as.numeric(y)))
rownames(t5) <- t3[,1]
t5
```

Drawing the chord diagram
```{r}
library(chorddiag)

chorddiag(as.matrix(t5),groupColors=states$Color,showTicks=F,groupnamePadding=20,groupThickness=.05,groupnameFontsize=10)
```


## Q2

まだindividualを入れてないです。
Read "Stops On Lines" and the all GIS data of bus lines.
```{r}
library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)

# Bus Stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, CRS("+proj=longlat +datum=WGS84"))

# Bus Routes
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
CC.pj <- spTransform(CC, CRS("+proj=longlat +datum=WGS84"))
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LE.pj <- spTransform(LE, CRS("+proj=longlat +datum=WGS84"))
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LCBD.pj <- spTransform(LCBD, CRS("+proj=longlat +datum=WGS84"))
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, CRS("+proj=longlat +datum=WGS84"))
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")
RBRT.pj <- spTransform(RBRT, CRS("+proj=longlat +datum=WGS84"))

# とりあえずindividual2についてのみreadできることを確認。一旦放置してこれ以外でできるか試す。
I2 <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216","2")
I2.pj <- spTransform(I2, CRS("+proj=longlat +datum=WGS84"))
```

make Line_list
```{r}
tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)
tmps <- list(tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT)

Line_list <- list()
for (i in 1:5){
  for (j in 1:length(tmps[[i]])){
    Line_list <- c(Line_list, tmps[[i]][j]@lines[[1]]@Lines)
  }
}
```


make new_id
```{r}
pjs <- list(CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)
LinLSs <- list()
for (i in 1:5){
 LinLSs <- c(LinLSs, sapply(pjs[[i]]@lines, function(x) length(x@Lines)))
}
LinLSs <- LinLSs %>% unlist()

new_id <- sapply(1:length(LinLSs), function(x) paste0(x, "_", seq.int(LinLSs[[x]]))) %>% 
  unlist()
```

```{r}
## make a new data.frame (only route_id)
DAT=data.frame(matrix(rep(NA,1),nrow=1))[-1,]
for (i in 1:5){
  df <- data.frame(route_id = pjs[[i]]@data$VAR_IDENT)
  DAT <- rbind(DAT, df)
}
rownames(DAT) <- new_id

SLDF <- mapply(function(x, y) Lines(x, ID = y), x = Line_list, y = new_id) %>%
  #list() %>%
  SpatialLines() %>% 
  SpatialLinesDataFrame(data = DAT)
```


make new lines and LA map
```{r}
dat <- geocode('Los Angels')

leaflet() %>%
  setView(lng = dat['lon'], lat = dat['lat']	, zoom = 11) %>%
  addPolylines(data = SLDF, color = "black", opacity = 1, weight = 1) %>% 
  addCircles(data=SOL.pj@data,~LONG, ~LAT, color = "red", weight = 0.3) %>%
  addTiles()
```


## Q3

First I load the data.
```{r}
library(quantmod)
library(highcharter)

x <- getSymbols("AUD/JPY", src = "oanda", auto.assign = FALSE)
y <- getSymbols("GBP/USD", src = "oanda", auto.assign = FALSE)
```

Next make Bollinger's bands for each exchange rate.
```{r}
x.BBands.ll <- BBands(x)$dn
x.BBands.ul <- BBands(x)$up
x.BBands.m <- BBands(x)$mavg
y.BBands.ll <- BBands(y)$dn
y.BBands.ul <- BBands(y)$up
y.BBands.m <- BBands(y)$mavg
```

The drawing code is as follows.
```{r}
hc <- highchart(type="stock") %>% 
  hc_title(text="Charting Exchange Rates") %>% 
  hc_subtitle(text = "Data extracted using quantmod package") %>% 
  hc_yAxis_multiples(
    list(top = "0%", height = "50%", offset=0, opposite=TRUE),
    list(top = "50%", height = "50%", offset=0, opposite=TRUE)
  )%>%
  hc_add_series(x, id = "audjpy",name ="audjpy", yAxis=0, color="blue", lineWidth=1.5) %>%
  hc_add_series(x.BBands.ll, id = "audjpy.ll", name="audjpy Lower BBands",yAxis=0,
                color="black",dashStyle='shortdash', lineWidth=1) %>%
  hc_add_series(x.BBands.ul, id = "audjpy.ul", name="audjpy Upper BBands",yAxis=0,
                color="black",lineWidth=1) %>%
  hc_add_series(x.BBands.m, id = "audjpy.m",name="audjpy BBands MA", yAxis=0,
                color="red",lineWidth=1) %>%
  hc_add_series(y, id = "gbpusd",name="gbpusd",yAxis=1, color="green", lineWidth=1.5) %>%
  hc_add_series(y.BBands.ll, id = "gbpusd.ll",name="gbpusd Lower BBands", yAxis=1,
                color="black",dashStyle='shortdash',lineWidth=1) %>%
  hc_add_series(y.BBands.ul, id = "gbpusd.ul",name="gbpusd Upper BBands", yAxis=1,
                color="black",lineWidth=1) %>%
  hc_add_series(y.BBands.m, id = "gbpusd.m",name="gbpusd BBands MA", yAxis=1,
                color="red",lineWidth=1) %>%
  hc_add_theme(hc_theme_538())

hc
```


## Q4

Load libraries and check the raw data. And make ffdf after converting character columns to factor columns in original df.
```{r}
library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)

tmp <- flights
tmp$carrier <- as.factor(tmp$carrier)
tmp$tailnum <- as.factor(tmp$tailnum)
tmp$origin <- as.factor(tmp$origin)
tmp$dest <- as.factor(tmp$dest)

flightff <- as.ffdf(tmp)
```

Next I make new columns as follows
```{r}
flightff$Delay <- ffifelse(flightff$dep_delay > 0, 1,0)
flightff$DepHour <- flightff$hour
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
flightff$Weekend <- ffifelse(day.of.week(month=flightff$month, day=flightff$day, year=flightff$year) == 6, 1, 0)
```

I exclude the rows whose Delay values are NA and rename it to logitff.
And then I split the dataset into train set and test set.
```{r}
logitff <- flightff[!is.na(flightff$Delay),]

indx <- ff(1:nrow(logitff))
p <- 0.7
trainIndx <- ff(indx[1:trunc(length(indx)*p)])
trainset <- logitff[trainIndx,]
testIndx <- ff(indx[(trunc(length(indx)*p)+1):length(indx)])
testset <- logitff[testIndx,]
```

Logistic regression 
```{r}
fit <- bigglm.ffdf(Delay~DepHour+Car+Night+Weekend, data = trainset, family=binomial(), sandwich=TRUE)
summary(fit)
```

predict and make confusionmatrix in train_set
```{r}
train_pred <- predict(fit, newdata = trainset, type="response")
train_pred <- ifelse(train_pred>0.5, 1,0)
train_confusion <- table(as.integer(as.data.frame(trainset)$Delay), as.integer(train_pred))
train_confusion <- addmargins(train_confusion)
train_confusion
```

predict and make confusionmatrix in test_set
```{r}
test_pred <- predict(fit, newdata = testset, type="response")
test_pred <- ifelse(test_pred>0.5, 1,0)
test_confusion <- table(as.integer(as.data.frame(testset)$Delay), as.integer(test_pred))
test_confusion <- addmargins(test_confusion)
test_confusion
```

Draw ROC curve
```{r}
test_pred <- predict(fit, newdata = testset, type="response")
roc <- roc(as.integer(as.data.frame(testset)$Delay), as.numeric(test_pred))
plot(roc)
```


## Q5




















